まえがき

ggplotを使うときの注意点,複数の図に分割する方法やデータを計算・要約する方法などを扱う

4.1 文法としては正しいが意味をなさない

エラーなく図が表示されても,意図した通りの図になっているとは限らないため,記述した内容を確認する必要がある.

4.2 グループ別データに対応する審美的要素

# Goal: draw the GDP per capita time series for each country.
# No group aesthetic is given here, so the lines are not drawn per country.
p <- ggplot(gapminder, aes(x = year, y = gdpPercap))
p + geom_line()

# Mistakes like this are nothing to worry about.

# Time series per country.
# The grouping has to be written explicitly in the code.
# The large outlier is Kuwait.
p <- ggplot(gapminder, aes(x = year, y = gdpPercap))
p + geom_line(aes(group = country))

# GDP per capita time series per country, colored by continent.
p <- ggplot(gapminder, aes(x = year, y = gdpPercap, color = continent))
p + geom_line(aes(group = country))

4.3 複数の図を並べるためのfacet_関数群

facet_関数群は対象の変数ごとに図を切り出して,パネル形式で出力するための関数群.

# Use facet_wrap() to fit the per-country time series, split by continent,
# into a single figure.
p <- ggplot(gapminder, aes(x = year, y = gdpPercap))
p +
  geom_line(aes(group = country)) +
  facet_wrap(vars(continent), ncol = 3)

# Black lines are hard to read, so draw the country lines in gray,
# and adjust several other things as well.
p <- ggplot(gapminder, aes(x = year, y = gdpPercap))
p +
  geom_line(aes(group = country), color = "gray") +
  geom_smooth(method = "loess", se = FALSE, size = 1.1) +
  scale_y_log10(labels = scales::dollar) +
  facet_wrap(vars(continent), ncol = 3) +
  labs(
    title = "GDP per capita on Five Continents",
    x = "year",
    y = "log GDP per capita"
  )
## `geom_smooth()` using formula 'y ~ x'

# We want to cross-classify the data by two categorical variables
# Use a new dataset, gss_sm
# gss_sm: a questionnaire survey of US adults on topics of interest to social scientists
# Provided by the socviz package
glimpse(gss_sm) # check an overview of the variables
## Rows: 2,867
## Columns: 32
## $ year        <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20…
## $ id          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ ballot      <labelled> 1, 2, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 3, 2, 3, 3, …
## $ age         <dbl> 47, 61, 72, 43, 55, 53, 50, 23, 45, 71, 33, 86, 32, 60, …
## $ childs      <dbl> 3, 0, 2, 4, 2, 2, 2, 3, 3, 4, 5, 4, 3, 5, 7, 2, 6, 5, 0,…
## $ sibs        <labelled> 2, 3, 3, 3, 2, 2, 2, 6, 5, 1, 4, 4, 3, 6, 0, 1, 3, …
## $ degree      <fct> Bachelor, High School, Bachelor, High School, Graduate, …
## $ race        <fct> White, White, White, White, White, White, White, Other, …
## $ sex         <fct> Male, Male, Male, Female, Female, Female, Male, Female, …
## $ region      <fct> New England, New England, New England, New England, New …
## $ income16    <fct> $170000 or over, $50000 to 59999, $75000 to $89999, $170…
## $ relig       <fct> None, None, Catholic, Catholic, None, None, None, Cathol…
## $ marital     <fct> Married, Never Married, Married, Married, Married, Marri…
## $ padeg       <fct> Graduate, Lt High School, High School, NA, Bachelor, NA,…
## $ madeg       <fct> High School, High School, Lt High School, High School, H…
## $ partyid     <fct> "Independent", "Ind,near Dem", "Not Str Republican", "No…
## $ polviews    <fct> Moderate, Liberal, Conservative, Moderate, Slightly Libe…
## $ happy       <fct> Pretty Happy, Pretty Happy, Very Happy, Pretty Happy, Ve…
## $ partners    <fct> NA, 1 Partner, 1 Partner, NA, 1 Partner, 1 Partner, NA, …
## $ grass       <fct> NA, Legal, Not Legal, NA, Legal, Legal, NA, Not Legal, N…
## $ zodiac      <fct> Aquarius, Scorpio, Pisces, Cancer, Scorpio, Scorpio, Cap…
## $ pres12      <labelled> 3, 1, 2, 2, 1, 1, NA, NA, NA, 2, NA, NA, 1, 1, 2, 1…
## $ wtssall     <dbl> 0.9569935, 0.4784968, 0.9569935, 1.9139870, 1.4354903, 0…
## $ income_rc   <fct> Gt $170000, Gt $50000, Gt $75000, Gt $170000, Gt $170000…
## $ agegrp      <fct> Age 45-55, Age 55-65, Age 65+, Age 35-45, Age 45-55, Age…
## $ ageq        <fct> Age 34-49, Age 49-62, Age 62+, Age 34-49, Age 49-62, Age…
## $ siblings    <fct> 2, 3, 3, 3, 2, 2, 2, 6+, 5, 1, 4, 4, 3, 6+, 0, 1, 3, 6+,…
## $ kids        <fct> 3, 0, 2, 4+, 2, 2, 2, 3, 3, 4+, 4+, 4+, 3, 4+, 4+, 2, 4+…
## $ religion    <fct> None, None, Catholic, Catholic, None, None, None, Cathol…
## $ bigregion   <fct> Northeast, Northeast, Northeast, Northeast, Northeast, N…
## $ partners_rc <fct> NA, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, NA, 1, NA, 0, 1, 0,…
## $ obama       <dbl> 0, 1, 0, 0, 1, 1, NA, NA, NA, 0, NA, NA, 1, 1, 0, 1, 0, …
# Number of children by age, faceted with sex on the rows and race on the
# columns.
p <- ggplot(gss_sm, aes(x = age, y = childs))
p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_grid(rows = vars(sex), cols = vars(race))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 18 rows containing non-finite values (stat_smooth).
## Warning: Removed 18 rows containing missing values (geom_point).

4.4 geomによるデータの変換

geom_*()関数は,それぞれデフォルトのstat_*()関数と紐づけられている(例: geom_bar()はstat_count()).

# Bar chart.
# The bar heights are computed inside geom_bar() by stat_count().
# Mapping color instead of fill would only change the bar outlines.
p <- ggplot(gss_sm, aes(x = bigregion, fill = bigregion))
p + geom_bar()

# To draw proportions instead of counts.
# Written like this, however, every bar comes out as 1.
p <- ggplot(gss_sm, aes(x = bigregion, fill = bigregion))
p + geom_bar(aes(y = after_stat(prop)))

# Supply 1 as a dummy group.
# In that case the fill coloring no longer takes effect
# (presumably because the group takes precedence?).
p <- ggplot(gss_sm, aes(x = bigregion, fill = bigregion))
p + geom_bar(aes(y = after_stat(prop), group = 1))

# How many respondents there are for each religion
table(gss_sm$religion)
## 
## Protestant   Catholic     Jewish       None      Other 
##       1371        649         51        619        159
# With color mapped, only the bar outlines are colored.
# guides(fill = "none") does not remove the legend in this case.
p <- ggplot(gss_sm, aes(x = religion, color = religion))
p +
  geom_bar() +
  guides(fill = "none")

# With fill mapped, guides(fill = "none") removes the legend.
p <- ggplot(gss_sm, aes(x = religion, fill = religion))
p +
  geom_bar() +
  guides(fill = "none")

# Writing geom_bar(mapping = aes(fill = religion)) instead
# gives the same result.

4.5 回りくどく度数分布表を描いてみる

# Number of respondents of each religion within each region (bigregion).
p <- ggplot(gss_sm, aes(x = bigregion, fill = religion))
p + geom_bar()

# That chart is hard to compare (e.g. Catholic is hard to read),
# so switch to a chart of proportions.
p <- ggplot(gss_sm, aes(x = bigregion, fill = religion))
# position = "fill" is a position setting, not the fill aesthetic.
p + geom_bar(position = "fill")

# That chart, however, does not let us judge the relative sizes.
# Comparing the share of each religion within a region is possible.
# position = "dodge" places the bars side by side.
p <- ggplot(gss_sm, aes(x = bigregion, fill = religion))
p + geom_bar(position = "dodge")

# Next, use after_stat(prop) so that the bars show proportions.
# The group is set to the variable to group by (religion here).
p <- ggplot(gss_sm, aes(x = bigregion, fill = religion))
p +
  geom_bar(
    aes(y = after_stat(prop), group = religion),
    position = "dodge"
  )

# In this chart the proportions do not sum to 1 within each region;
# instead, each religion sums to 1 across the regions.
# In other words, it shows that 50% of the respondents who answered Jewish
# live in the Northeast.

# This is still not the chart we originally wanted:
# we want the categories side by side so that their heights can be compared.
# Use facet_wrap() to split the chart by region.
p <- ggplot(gss_sm, aes(x = religion))
p +
  geom_bar(
    aes(y = after_stat(prop), group = bigregion),
    position = "dodge"
  ) +
  facet_wrap(vars(bigregion), ncol = 2)

頻度をプロットする場合,ggplotで全てを行おうとすると沼にハマりがち.dplyrパッケージを使った方が信頼性が高く,エラーの確認も容易な方法になる.それは第5章で述べる.

4.6 ヒストグラムと密度プロット

連続量をヒストグラムとして描く時は,binを指定する必要がある(デフォルトはbins = 30).binの数や幅によって分布の見え方が変わるため,ヒストグラムのbinの設定には注意が必要.

# Use the midwest dataset
glimpse(midwest)
## Rows: 437
## Columns: 28
## $ PID                  <int> 561, 562, 563, 564, 565, 566, 567, 568, 569, 57…
## $ county               <chr> "ADAMS", "ALEXANDER", "BOND", "BOONE", "BROWN",…
## $ state                <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL",…
## $ area                 <dbl> 0.052, 0.014, 0.022, 0.017, 0.018, 0.050, 0.017…
## $ poptotal             <int> 66090, 10626, 14991, 30806, 5836, 35688, 5322, …
## $ popdensity           <dbl> 1270.9615, 759.0000, 681.4091, 1812.1176, 324.2…
## $ popwhite             <int> 63917, 7054, 14477, 29344, 5264, 35157, 5298, 1…
## $ popblack             <int> 1702, 3496, 429, 127, 547, 50, 1, 111, 16, 1655…
## $ popamerindian        <int> 98, 19, 35, 46, 14, 65, 8, 30, 8, 331, 51, 26, …
## $ popasian             <int> 249, 48, 16, 150, 5, 195, 15, 61, 23, 8033, 89,…
## $ popother             <int> 124, 9, 34, 1139, 6, 221, 0, 84, 6, 1596, 20, 7…
## $ percwhite            <dbl> 96.71206, 66.38434, 96.57128, 95.25417, 90.1987…
## $ percblack            <dbl> 2.57527614, 32.90043290, 2.86171703, 0.41225735…
## $ percamerindan        <dbl> 0.14828264, 0.17880670, 0.23347342, 0.14932156,…
## $ percasian            <dbl> 0.37675897, 0.45172219, 0.10673071, 0.48691813,…
## $ percother            <dbl> 0.18762294, 0.08469791, 0.22680275, 3.69733169,…
## $ popadults            <int> 43298, 6724, 9669, 19272, 3979, 23444, 3583, 11…
## $ perchsd              <dbl> 75.10740, 59.72635, 69.33499, 75.47219, 68.8615…
## $ percollege           <dbl> 19.63139, 11.24331, 17.03382, 17.27895, 14.4760…
## $ percprof             <dbl> 4.355859, 2.870315, 4.488572, 4.197800, 3.36768…
## $ poppovertyknown      <int> 63628, 10529, 14235, 30337, 4815, 35107, 5241, …
## $ percpovertyknown     <dbl> 96.27478, 99.08714, 94.95697, 98.47757, 82.5051…
## $ percbelowpoverty     <dbl> 13.151443, 32.244278, 12.068844, 7.209019, 13.5…
## $ percchildbelowpovert <dbl> 18.011717, 45.826514, 14.036061, 11.179536, 13.…
## $ percadultpoverty     <dbl> 11.009776, 27.385647, 10.852090, 5.536013, 11.1…
## $ percelderlypoverty   <dbl> 12.443812, 25.228976, 12.697410, 6.217047, 19.2…
## $ inmetro              <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,…
## $ category             <chr> "AAR", "LHR", "AAR", "ALU", "AAR", "AAR", "LAR"…
# Histograms of the area variable.
p <- ggplot(midwest, aes(x = area))

# bins: the number of bins
p + geom_histogram(bins = 10)

# binwidth: the width of each bin
p + geom_histogram(binwidth = 0.01)

# Several histograms can be drawn together in one figure.
# Keep only the two states OH and WI.
oh_wi <- c("OH", "WI")
p <- ggplot(
  subset(midwest, subset = state %in% oh_wi),
  aes(x = percollege, fill = state)
)
p + geom_histogram(bins = 20, alpha = 0.4)

# Kernel density estimation is another option.
p <- ggplot(midwest, aes(x = area))
p + geom_density()

# Color by state.
# fill applies to the body of the density curve, color to its line.
p <- ggplot(midwest, aes(x = area, fill = state, color = state))
p + geom_density(alpha = 0.1)

# When the overlap is hard to read, geom_line(stat = "density") draws the
# curves as lines only.
p <- ggplot(midwest, aes(x = area, color = state))
p + geom_line(stat = "density")

# Like geom_bar(), geom_histogram() and geom_density() can also compute
# relative values.
p <- ggplot(midwest, aes(x = area, color = state, fill = state))
p + geom_density(aes(y = after_stat(scaled)), alpha = 0.3)

# count returns the density multiplied by the number of data points.
p <- ggplot(midwest, aes(x = area, color = state, fill = state))
p + geom_density(aes(y = after_stat(count)), alpha = 0.3)

4.7 不要な変換を避ける

得られたデータがすでに要約されている場合など,変換を避けたい場合はstat = "identity"と記述する.

# titanic: data on the survivors of the Titanic
titanic
##       fate    sex    n percent
## 1 perished   male 1364    62.0
## 2 perished female  126     5.7
## 3 survived   male  367    16.7
## 4 survived female  344    15.6
# Percentage by fate (perished / survived) and sex in the titanic data.
p <- ggplot(titanic, aes(x = fate, y = percent, fill = sex))

# stat = "identity" is used when the variable should not be transformed.
p +
  geom_bar(stat = "identity", position = "dodge") +
  theme(legend.position = "top")

# geom_col() can be used instead.
p + geom_col(position = "dodge")

# oecd_sum: information on average life expectancy at birth for the US and OECD countries
# The other column is the average life expectancy excluding the United States
# Provided by the socviz package
glimpse(oecd_sum)
## Rows: 57
## Columns: 5
## Groups: year [57]
## $ year  <int> 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 19…
## $ other <dbl> 68.6, 69.2, 68.9, 69.1, 69.5, 69.6, 69.9, 70.1, 70.1, 70.1, 69…
## $ usa   <dbl> 69.9, 70.4, 70.2, 70.0, 70.3, 70.3, 70.3, 70.7, 70.4, 70.6, 70…
## $ diff  <dbl> 1.3, 1.2, 1.3, 0.9, 0.8, 0.7, 0.4, 0.6, 0.3, 0.5, 1.1, 0.8, 0.…
## $ hi_lo <chr> "Below", "Below", "Below", "Below", "Below", "Below", "Below",…
# Print oecd_sum to look at its first rows
oecd_sum
## # A tibble: 57 x 5
## # Groups:   year [57]
##     year other   usa  diff hi_lo
##    <int> <dbl> <dbl> <dbl> <chr>
##  1  1960  68.6  69.9 1.3   Below
##  2  1961  69.2  70.4 1.2   Below
##  3  1962  68.9  70.2 1.30  Below
##  4  1963  69.1  70   0.9   Below
##  5  1964  69.5  70.3 0.800 Below
##  6  1965  69.6  70.3 0.7   Below
##  7  1966  69.9  70.3 0.400 Below
##  8  1967  70.1  70.7 0.6   Below
##  9  1968  70.1  70.4 0.3   Below
## 10  1969  70.1  70.6 0.5   Below
## # … with 47 more rows
# Bar chart of diff (the US vs. OECD life expectancy gap) by year.
# Bars are filled by hi_lo and the fill legend is hidden.
p <- ggplot(data = oecd_sum,
            mapping = aes(x = year, y = diff, fill = hi_lo))
p + geom_col() +
    guides(fill = "none") + # hide the legend
    labs(x = NULL, y = "Difference in years",
         title = "The US Life Expectancy Gap",
         subtitle = "Difference between US and OECD average life expectancy, 1960-2015",
         caption = "Data: OECD. After a chart by Christopher Ingraham, Washington Post, December 27th 2017"
         )
## Warning: Removed 1 rows containing missing values (position_stack).

4.8 次の一手

# Try faceting the gapminder dataset by various variables
glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghan…
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia…
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997…
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, …
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134…
# List the distinct country values in gapminder
unique(gapminder$country)
##   [1] Afghanistan              Albania                  Algeria                 
##   [4] Angola                   Argentina                Australia               
##   [7] Austria                  Bahrain                  Bangladesh              
##  [10] Belgium                  Benin                    Bolivia                 
##  [13] Bosnia and Herzegovina   Botswana                 Brazil                  
##  [16] Bulgaria                 Burkina Faso             Burundi                 
##  [19] Cambodia                 Cameroon                 Canada                  
##  [22] Central African Republic Chad                     Chile                   
##  [25] China                    Colombia                 Comoros                 
##  [28] Congo, Dem. Rep.         Congo, Rep.              Costa Rica              
##  [31] Cote d'Ivoire            Croatia                  Cuba                    
##  [34] Czech Republic           Denmark                  Djibouti                
##  [37] Dominican Republic       Ecuador                  Egypt                   
##  [40] El Salvador              Equatorial Guinea        Eritrea                 
##  [43] Ethiopia                 Finland                  France                  
##  [46] Gabon                    Gambia                   Germany                 
##  [49] Ghana                    Greece                   Guatemala               
##  [52] Guinea                   Guinea-Bissau            Haiti                   
##  [55] Honduras                 Hong Kong, China         Hungary                 
##  [58] Iceland                  India                    Indonesia               
##  [61] Iran                     Iraq                     Ireland                 
##  [64] Israel                   Italy                    Jamaica                 
##  [67] Japan                    Jordan                   Kenya                   
##  [70] Korea, Dem. Rep.         Korea, Rep.              Kuwait                  
##  [73] Lebanon                  Lesotho                  Liberia                 
##  [76] Libya                    Madagascar               Malawi                  
##  [79] Malaysia                 Mali                     Mauritania              
##  [82] Mauritius                Mexico                   Mongolia                
##  [85] Montenegro               Morocco                  Mozambique              
##  [88] Myanmar                  Namibia                  Nepal                   
##  [91] Netherlands              New Zealand              Nicaragua               
##  [94] Niger                    Nigeria                  Norway                  
##  [97] Oman                     Pakistan                 Panama                  
## [100] Paraguay                 Peru                     Philippines             
## [103] Poland                   Portugal                 Puerto Rico             
## [106] Reunion                  Romania                  Rwanda                  
## [109] Sao Tome and Principe    Saudi Arabia             Senegal                 
## [112] Serbia                   Sierra Leone             Singapore               
## [115] Slovak Republic          Slovenia                 Somalia                 
## [118] South Africa             Spain                    Sri Lanka               
## [121] Sudan                    Swaziland                Sweden                  
## [124] Switzerland              Syria                    Taiwan                  
## [127] Tanzania                 Thailand                 Togo                    
## [130] Trinidad and Tobago      Tunisia                  Turkey                  
## [133] Uganda                   United Kingdom           United States           
## [136] Uruguay                  Venezuela                Vietnam                 
## [139] West Bank and Gaza       Yemen, Rep.              Zambia                  
## [142] Zimbabwe                
## 142 Levels: Afghanistan Albania Algeria Angola Argentina Australia ... Zimbabwe
# List the distinct year values in gapminder
unique(gapminder$year)
##  [1] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
# Scatter plot of population vs. GDP per capita, with one panel per year.
# year is shown through the facets only: the default point shape of
# geom_point() does not use fill, so no fill mapping (and no legend
# suppression for it) is needed.
p <- ggplot(data = gapminder,
            mapping = aes(x = pop, y = gdpPercap))
p.out <- p + geom_point() +
             facet_wrap(~ year, ncol = 2)
# No width/height given, so ggsave() uses the size of the current device.
ggsave(filename = "pop_gdp_facet_year.pdf", plot = p.out)
## Saving 7 x 5 in image
# Population vs. GDP per capita on a log10 x axis, colored by year,
# with one panel per country.
p <- ggplot(gapminder, aes(x = pop, y = gdpPercap, color = year))
p.out <- p +
  geom_point() +
  scale_x_log10() +
  facet_wrap(vars(country), ncol = 4)
# Saved on a 20 x 40 inch canvas.
ggsave(
  "pop_gdp_year_facet_country.jpg",
  plot = p.out,
  width = 20, height = 40, units = "in"
)

# Compare facet_grid and facet_wrap
# Use the gss_sm dataset
# x is age, y is childs, faceted by sex and race
glimpse(gss_sm)
## Rows: 2,867
## Columns: 32
## $ year        <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20…
## $ id          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ ballot      <labelled> 1, 2, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 3, 2, 3, 3, …
## $ age         <dbl> 47, 61, 72, 43, 55, 53, 50, 23, 45, 71, 33, 86, 32, 60, …
## $ childs      <dbl> 3, 0, 2, 4, 2, 2, 2, 3, 3, 4, 5, 4, 3, 5, 7, 2, 6, 5, 0,…
## $ sibs        <labelled> 2, 3, 3, 3, 2, 2, 2, 6, 5, 1, 4, 4, 3, 6, 0, 1, 3, …
## $ degree      <fct> Bachelor, High School, Bachelor, High School, Graduate, …
## $ race        <fct> White, White, White, White, White, White, White, Other, …
## $ sex         <fct> Male, Male, Male, Female, Female, Female, Male, Female, …
## $ region      <fct> New England, New England, New England, New England, New …
## $ income16    <fct> $170000 or over, $50000 to 59999, $75000 to $89999, $170…
## $ relig       <fct> None, None, Catholic, Catholic, None, None, None, Cathol…
## $ marital     <fct> Married, Never Married, Married, Married, Married, Marri…
## $ padeg       <fct> Graduate, Lt High School, High School, NA, Bachelor, NA,…
## $ madeg       <fct> High School, High School, Lt High School, High School, H…
## $ partyid     <fct> "Independent", "Ind,near Dem", "Not Str Republican", "No…
## $ polviews    <fct> Moderate, Liberal, Conservative, Moderate, Slightly Libe…
## $ happy       <fct> Pretty Happy, Pretty Happy, Very Happy, Pretty Happy, Ve…
## $ partners    <fct> NA, 1 Partner, 1 Partner, NA, 1 Partner, 1 Partner, NA, …
## $ grass       <fct> NA, Legal, Not Legal, NA, Legal, Legal, NA, Not Legal, N…
## $ zodiac      <fct> Aquarius, Scorpio, Pisces, Cancer, Scorpio, Scorpio, Cap…
## $ pres12      <labelled> 3, 1, 2, 2, 1, 1, NA, NA, NA, 2, NA, NA, 1, 1, 2, 1…
## $ wtssall     <dbl> 0.9569935, 0.4784968, 0.9569935, 1.9139870, 1.4354903, 0…
## $ income_rc   <fct> Gt $170000, Gt $50000, Gt $75000, Gt $170000, Gt $170000…
## $ agegrp      <fct> Age 45-55, Age 55-65, Age 65+, Age 35-45, Age 45-55, Age…
## $ ageq        <fct> Age 34-49, Age 49-62, Age 62+, Age 34-49, Age 49-62, Age…
## $ siblings    <fct> 2, 3, 3, 3, 2, 2, 2, 6+, 5, 1, 4, 4, 3, 6+, 0, 1, 3, 6+,…
## $ kids        <fct> 3, 0, 2, 4+, 2, 2, 2, 3, 3, 4+, 4+, 4+, 3, 4+, 4+, 2, 4+…
## $ religion    <fct> None, None, Catholic, Catholic, None, None, None, Cathol…
## $ bigregion   <fct> Northeast, Northeast, Northeast, Northeast, Northeast, N…
## $ partners_rc <fct> NA, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, NA, 1, NA, 0, 1, 0,…
## $ obama       <dbl> 0, 1, 0, 0, 1, 1, NA, NA, NA, 0, NA, NA, 1, 1, 0, 1, 0, …
# List the column names of gss_sm
colnames(gss_sm)
##  [1] "year"        "id"          "ballot"      "age"         "childs"     
##  [6] "sibs"        "degree"      "race"        "sex"         "region"     
## [11] "income16"    "relig"       "marital"     "padeg"       "madeg"      
## [16] "partyid"     "polviews"    "happy"       "partners"    "grass"      
## [21] "zodiac"      "pres12"      "wtssall"     "income_rc"   "agegrp"     
## [26] "ageq"        "siblings"    "kids"        "religion"    "bigregion"  
## [31] "partners_rc" "obama"
# Base scatter plot of childs against age, shared by the three layouts below.
p <- ggplot(gss_sm, aes(x = age, y = childs))

# facet_grid() with the formula sex ~ race.
p1 <- p +
  geom_point() +
  facet_grid(sex ~ race) +
  labs(title = "facet_grid: sex ~ race")

# facet_grid() with the formula ~ sex + race.
p2 <- p +
  geom_point() +
  facet_grid(~ sex + race) +
  labs(title = "facet_grid: ~ sex + race")

# facet_wrap() with the formula ~ sex + race.
p3 <- p +
  geom_point() +
  facet_wrap(~ sex + race) +
  labs(title = "facet_wrap: ~sex + race")

# Combine them into a single figure for easier comparison
gridExtra::grid.arrange(p1, p2, p3)
## Warning: Removed 18 rows containing missing values (geom_point).

## Warning: Removed 18 rows containing missing values (geom_point).

## Warning: Removed 18 rows containing missing values (geom_point).

p.all <- ggpubr::ggarrange(p1, p2, p3) # ggpubr::ggarrange() is a way to combine multiple plots into a single plot
## Warning: Removed 18 rows containing missing values (geom_point).

## Warning: Removed 18 rows containing missing values (geom_point).

## Warning: Removed 18 rows containing missing values (geom_point).
# Save the combined figure as a 10 x 10 inch PDF.
ggsave(
  "comparison_facet.pdf",
  plot = p.all,
  width = 10, height = 10, units = "in"
)

# Frequency polygon
# Shows the number of observations with a continuous line instead of bars
colnames(midwest)
##  [1] "PID"                  "county"               "state"               
##  [4] "area"                 "poptotal"             "popdensity"          
##  [7] "popwhite"             "popblack"             "popamerindian"       
## [10] "popasian"             "popother"             "percwhite"           
## [13] "percblack"            "percamerindan"        "percasian"           
## [16] "percother"            "popadults"            "perchsd"             
## [19] "percollege"           "percprof"             "poppovertyknown"     
## [22] "percpovertyknown"     "percbelowpoverty"     "percchildbelowpovert"
## [25] "percadultpoverty"     "percelderlypoverty"   "inmetro"             
## [28] "category"
# Histogram and frequency polygon of area with the same bin width,
# arranged into one figure.
p <- ggplot(midwest, aes(x = area))
p1 <- p + geom_histogram(binwidth = 0.01)
p2 <- p + geom_freqpoly(binwidth = 0.01)
p.all <- ggpubr::ggarrange(p1, p2)
p.all

# Save the combined figure as a 10 x 10 inch PNG.
ggsave(
  "histgram_freqpoly.png",
  plot = p.all,
  width = 10, height = 10, units = "in"
)

# Density estimate of area, with fill and line color mapped to state.
p <- ggplot(midwest, aes(x = area, fill = state, color = state))
p + geom_density(alpha = 0.1)

# Two-dimensional density of percollege (x) against percbelowpoverty (y).
p <- ggplot(midwest, aes(x = percollege, y = percbelowpoverty))

# geom_density2d() drawn on top of semi-transparent points.
p +
  geom_point(alpha = 0.2) +
  geom_density2d()

# geom_density2d() on its own.
p + geom_density2d()

# Filled variant.
p + geom_density_2d_filled()